In [ ]:
import numpy as np
import pandas as pd

import seaborn as sns #数据可视化
import matplotlib.pyplot as plt
%matplotlib inline
from ydata_profiling import ProfileReport #数据EDA交互工具 EDA(Exploratory Data Analysis)数据探索性分析

from sklearn.model_selection import train_test_split #切割数据集

from sklearn.linear_model import LogisticRegression #逻辑回归
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.svm import SVC, LinearSVC #支持向量机
from sklearn.neighbors import KNeighborsClassifier #k近邻
from sklearn.naive_bayes import GaussianNB #朴素贝叶斯
from sklearn.linear_model import Perceptron #感知机
from sklearn.linear_model import SGDClassifier #随机梯度下降
from sklearn.tree import DecisionTreeClassifier #决策树

from sklearn.preprocessing import LabelEncoder #分类变量编码

from sklearn.model_selection import cross_val_score #模型评估
In [ ]:
df = pd.read_csv('data/train.csv')

1.1 处理缺失值和重复值¶

In [ ]:
# 查看缺失值
df.isna().sum()
Out[ ]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

法一:¶

  • 对分类变量缺失值:填充某个缺失值字符(NA)、用最多类别的进行填充
  • 对连续变量缺失值:填充均值、中位数、众数
In [ ]:
# 对连续变量进行填充 用均值填充
df['Age'] = df['Age'].fillna(df['Age'].mean())
In [ ]:
# 查看数据分布情况
df['Cabin'].value_counts()
Out[ ]:
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64
In [ ]:
df['Embarked'].value_counts()
Out[ ]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [ ]:
# 对分类变量进行填充 填充某个缺失值字符(NA)、用最多类别的进行填充
df['Cabin'] = df['Cabin'].fillna('NA')
df['Embarked'] = df['Embarked'].fillna('S')
In [ ]:
# 查看重复值个数
df.duplicated().sum()
Out[ ]:
0

法二:¶

In [ ]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
combine = [train_df, test_df]

查看数据整体分布情况

  • 数值型 .describe()
  • 分类型 .describe(include = ['0'])
In [ ]:
train_df.columns
Out[ ]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
In [ ]:
train_df.describe()
Out[ ]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [ ]:
train_df.describe(include=['O'])
Out[ ]:
Name Sex Ticket Cabin Embarked
count 891 891 891 204 889
unique 891 2 681 147 3
top Braund, Mr. Owen Harris male 347082 B96 B98 S
freq 1 577 7 4 644
In [ ]:
print(train_df.isna().sum())
print('_'*40)
print(test_df.isna().sum())
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
________________________________________
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
  • Age:根据Sex,Pclass分类后的中位数来填充
In [ ]:
grid = sns.FacetGrid(train_df, row='Pclass', col='Sex', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x22eab933160>
In [ ]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
In [ ]:
guess_ages = np.zeros((2,3))
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                (dataset['Pclass'] == j+1)]['Age'].dropna()

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]

    dataset['Age'] = dataset['Age'].astype(int)
In [ ]:
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)
Out[ ]:
AgeBand Survived
0 (-0.08, 16.0] 0.550000
1 (16.0, 32.0] 0.337374
2 (32.0, 48.0] 0.412037
3 (48.0, 64.0] 0.434783
4 (64.0, 80.0] 0.090909
In [ ]:
train_df.Fare
Out[ ]:
0       7.2500
1      71.2833
2       7.9250
3      53.1000
4       8.0500
        ...   
886    13.0000
887    30.0000
888    23.4500
889    30.0000
890     7.7500
Name: Fare, Length: 891, dtype: float64

1.2 数据可视化¶

In [ ]:
df.head(3)
Out[ ]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NA S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NA S
In [ ]:
df.columns
Out[ ]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
In [ ]:
df.groupby(['Pclass','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
Out[ ]:
<Axes: xlabel='Pclass'>
In [ ]:
df.groupby(['Sex','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
Out[ ]:
<Axes: xlabel='Sex'>
In [ ]:
df.groupby(['SibSp','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
Out[ ]:
<Axes: xlabel='SibSp'>
In [ ]:
df.groupby(['Parch','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
Out[ ]:
<Axes: xlabel='Parch'>
In [ ]:
df.groupby(['Embarked','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
Out[ ]:
<Axes: xlabel='Embarked'>
In [ ]:
g = sns.FacetGrid(df, col='Survived')
g.map(plt.hist, 'Age', bins=20)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x22ea580afb0>
In [ ]:
facet = sns.FacetGrid(df, hue="Survived",aspect=3)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, df['Age'].max()))
facet.add_legend()
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  func(*plot_args, **plot_kwargs)
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  func(*plot_args, **plot_kwargs)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x22e841f6a10>
In [ ]:
facet = sns.FacetGrid(df, hue="Survived",aspect=3)
facet.map(sns.kdeplot,'Fare',shade= True)
facet.set(xlim=(0, df['Fare'].max()))
facet.add_legend()
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  func(*plot_args, **plot_kwargs)
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: 

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  func(*plot_args, **plot_kwargs)
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1f39775c4f0>
In [ ]:
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
C:\Users\XLL\AppData\Local\Temp\ipykernel_11864\2675011510.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
Out[ ]:
<Axes: >
In [ ]:
profile = ProfileReport(df)
profile
Summarize dataset: 100%|██████████| 47/47 [00:05<00:00,  7.95it/s, Completed]                       
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.44s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.58s/it]
Out[ ]:

1.3 创造新变量¶

  • 提取部分字符串:把Name中的Title提取出来
In [ ]:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train_df['Title'], train_df['Sex'])
Out[ ]:
Sex female male
Title
Capt 0 1
Col 0 2
Countess 1 0
Don 0 1
Dr 1 6
Jonkheer 0 1
Lady 1 0
Major 0 2
Master 0 40
Miss 182 0
Mlle 2 0
Mme 1 0
Mr 0 517
Mrs 125 0
Ms 1 0
Rev 0 6
Sir 0 1
In [ ]:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
Out[ ]:
Title Survived
0 Master 0.575000
1 Miss 0.702703
2 Mr 0.156673
3 Mrs 0.793651
4 Rare 0.347826
In [ ]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[ ]:
FamilySize Survived
3 4 0.724138
2 3 0.578431
1 2 0.552795
6 7 0.333333
0 1 0.303538
4 5 0.200000
5 6 0.136364
7 8 0.000000
8 11 0.000000
In [ ]:
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
Out[ ]:
IsAlone Survived
0 0 0.505650
1 1 0.303538
In [ ]:
 
In [ ]:
 
In [ ]:
 

2.1 模型搭建¶

In [ ]:
def model_score(X,y):
    # 对数据集进行切割
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # 查看训练集和测试集score值
    print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
    print("Testing set score: {:.2f}".format(lr.score(X_test, y_test)))

2.1.1 分类变量 用pd.get_dummies(data)¶

In [ ]:
# 取出所有的输入特征
data = df[['Pclass','Sex','Age','SibSp','Parch','Fare', 'Embarked']]
# 进行虚拟变量转换
data = pd.get_dummies(data)
data.head(1)
Out[ ]:
Pclass Age SibSp Parch Fare Sex_female Sex_male Embarked_C Embarked_Q Embarked_S
0 3 22.0 1 0 7.25 0 1 0 0 1
In [ ]:
model_score(data,df['Survived'])
Training set score: 0.80
Testing set score: 0.78
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

2.1.2 分类变量编码 用LabelEncoder()¶

In [ ]:
df = pd.read_csv('data/train_1.csv')
#分类变量编码
for feat in ['Sex', 'Embarked']:
    lbl = LabelEncoder()  
    label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
    df[feat + "_labelEncode"] = df[feat].map(label_dict)
    df[feat + "_labelEncode"] = lbl.fit_transform(df[feat].astype(str))
# 取出所有的输入特征
data = df[['Pclass','Sex_labelEncode','Age','SibSp','Parch','Fare', 'Embarked_labelEncode']]
data.head(1)
Out[ ]:
Pclass Sex_labelEncode Age SibSp Parch Fare Embarked_labelEncode
0 3 1 22.0 1 0 7.25 2
In [ ]:
model_score(data,df['Survived'])
Training set score: 0.81
Testing set score: 0.79
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

2.1.3 Age,Fare分箱¶

In [ ]:
df = pd.read_csv('data/train_2.csv')
df['AgeBand'] = pd.cut(df['Age'], 5,labels = [1,2,3,4,5])
df.groupby(['AgeBand','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
Out[ ]:
<Axes: xlabel='AgeBand'>
In [ ]:
df['Fare'].describe()
Out[ ]:
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64
In [ ]:
df['FareBand'] = pd.cut(df['Fare'],[0,8,16,32,100,513],labels = [1,2,3,4,5])
df.groupby(['FareBand','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked='True')
Out[ ]:
<Axes: xlabel='FareBand'>
In [ ]:
data = df[['Pclass','Sex_labelEncode','AgeBand','SibSp','Parch','FareBand', 'Embarked_labelEncode']]
data.head(1)
Out[ ]:
Pclass Sex_labelEncode AgeBand SibSp Parch FareBand Embarked_labelEncode
0 3 1 2 1 0 1 2
In [ ]:
model_score(data,df['Survived'])
Training set score: 0.62
Testing set score: 0.61
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arrays\base.py:513: RuntimeWarning: invalid value encountered in cast
  result = np.asarray(self, dtype=dtype)
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=2):
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arrays\base.py:513: RuntimeWarning: invalid value encountered in cast
  result = np.asarray(self, dtype=dtype)
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\arrays\base.py:513: RuntimeWarning: invalid value encountered in cast
  result = np.asarray(self, dtype=dtype)

2.2 预测结果¶

2.2.1 Age均值填充,Age,Sex编码( 0.76794 )¶

In [ ]:
def preprocessing(df):
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')
    for feat in ['Sex', 'Embarked']:
        lbl = LabelEncoder()  
        label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
        df[feat + "_labelEncode"] = df[feat].map(label_dict)
        df[feat + "_labelEncode"] = lbl.fit_transform(df[feat].astype(str))
    data = df[['Pclass','Sex_labelEncode','Age','SibSp','Parch','Fare', 'Embarked_labelEncode']]
    return data
In [ ]:
def training():
    df = pd.read_csv('data/train.csv')
    data = preprocessing(df)
    X=data
    y=df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state=0)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
    print("Testing set score: {:.2f}".format(lr.score(X_test, y_test)))
    return lr
In [ ]:
lr = training()
Training set score: 0.81
Testing set score: 0.79
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [ ]:
test = pd.read_csv('data/test.csv')
test_data = preprocessing(test)
pred = lr.predict(test_data)
result = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
result.to_csv('data/result.csv',index=False)

2.2.2 Age均值填充,Age,Sex编码,Age改成Ageband( 0.75358 )¶

In [ ]:
def preprocessing(df):
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')
    for feat in ['Sex', 'Embarked']:
        lbl = LabelEncoder()  
        label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
        df[feat + "_labelEncode"] = df[feat].map(label_dict)
        df[feat + "_labelEncode"] = lbl.fit_transform(df[feat].astype(str))
    df['AgeBand'] = pd.cut(df['Age'], 5,labels = [1,2,3,4,5])
    data = df[['Pclass','Sex_labelEncode','AgeBand','SibSp','Parch','Fare', 'Embarked_labelEncode']]
    return data
In [ ]:
def training():
    df = pd.read_csv('data/train.csv')
    data = preprocessing(df)
    X=data
    y=df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state=0)
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
    print("Testing set score: {:.2f}".format(lr.score(X_test, y_test)))
    return lr
In [ ]:
lr = training()
Training set score: 0.80
Testing set score: 0.78
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [ ]:
test = pd.read_csv('data/test.csv')
test_data = preprocessing(test)
pred = lr.predict(test_data)
result = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
result.to_csv('data/result.csv',index=False)

2.2.3 Age分类中位数填充,Name中提取称谓Title,isAlone( 0.76555 )¶

In [ ]:
def preprocessing(df):

    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')

    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

    for feat in ['Sex', 'Embarked', 'Title']:
        lbl = LabelEncoder()  
        label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
        df[feat] = df[feat].map(label_dict)
        df[feat] = lbl.fit_transform(df[feat].astype(str))

    guess_ages = np.zeros((2,3))
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = df[(df['Sex'] == i) & \
                (df['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_df.median()
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            df.loc[ (df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]
    df['Age'] = df['Age'].astype(int)

    df['AgeBand'] = pd.cut(df['Age'], 5,labels = [1,2,3,4,5])

    data = df[['Pclass','Sex','AgeBand','SibSp','Parch','Fare','Embarked','Title','IsAlone']]
    
    return data
In [ ]:
def training():
    df = pd.read_csv('data/train.csv')
    data = preprocessing(df)
    X=data
    y=df['Survived']
    lr = LogisticRegression()
    lr.fit(X,y)
    print("Training set score: {:.2f}".format(lr.score(X,y)))
    return lr
In [ ]:
lr = training()
Training set score: 0.81
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [ ]:
test = pd.read_csv('data/test.csv')
test_data = preprocessing(test)
pred = lr.predict(test_data)
result = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
result.to_csv('data/result.csv',index=False)

2.2.4 用RF,Age分类中位数填充,Name中提取称谓Title,isAlone( 0.72488 )¶

In [ ]:
def preprocessing(df):

    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')

    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

    for feat in ['Sex', 'Embarked', 'Title']:
        lbl = LabelEncoder()  
        label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
        df[feat] = df[feat].map(label_dict)
        df[feat] = lbl.fit_transform(df[feat].astype(str))

    guess_ages = np.zeros((2,3))
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = df[(df['Sex'] == i) & \
                (df['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_df.median()
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            df.loc[ (df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]
    df['Age'] = df['Age'].astype(int)

    df['AgeBand'] = pd.cut(df['Age'], 5,labels = [1,2,3,4,5])

    df['Age*Class'] = df.Age * df.Pclass

    data = df[['Pclass','Sex','AgeBand','SibSp','Parch','Fare','Embarked','Title','IsAlone','Age*Class']]
    
    return data
In [ ]:
def training():
    df = pd.read_csv('data/train.csv')
    data = preprocessing(df)
    X=data
    y=df['Survived']
    lr = RandomForestClassifier()
    lr.fit(X, y)
    print(cross_val_score(lr, X, y, cv=10).mean())
    return lr
In [ ]:
lr = training()
0.8159925093632958
In [ ]:
test = pd.read_csv('data/test.csv')
test_data = preprocessing(test)
pred = lr.predict(test_data)
result = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
result.to_csv('data/result.csv',index=False)

2.3 模型评估¶

In [ ]:
def preprocessing(df):

    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Cabin'] = df['Cabin'].fillna('NA')
    df['Embarked'] = df['Embarked'].fillna('S')

    df['Title'] = df.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 1, 'IsAlone'] = 1

    for feat in ['Sex', 'Embarked', 'Title']:
        lbl = LabelEncoder()  
        label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
        df[feat] = df[feat].map(label_dict)
        df[feat] = lbl.fit_transform(df[feat].astype(str))

    guess_ages = np.zeros((2,3))
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = df[(df['Sex'] == i) & \
                (df['Pclass'] == j+1)]['Age'].dropna()
            age_guess = guess_df.median()
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            df.loc[ (df.Age.isnull()) & (df.Sex == i) & (df.Pclass == j+1),\
                    'Age'] = guess_ages[i,j]
    df['Age'] = df['Age'].astype(int)

    df['AgeBand'] = pd.cut(df['Age'], 5,labels = [1,2,3,4,5])

    df['Age*Class'] = df.Age * df.Pclass

    data = df[['Pclass','Sex','AgeBand','SibSp','Parch','Fare','Embarked','Title','IsAlone','Age*Class']]
    
    return data
In [ ]:
scores = pd.Series()
C:\Users\XLL\AppData\Local\Temp\ipykernel_772\3866007741.py:1: FutureWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
  scores = pd.Series()
In [ ]:
df = pd.read_csv('data/train.csv')
data = preprocessing(df)
X=data
y=df['Survived']

lr = LogisticRegression()
lr.fit(X, y)
scores['LogisticRegression'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)

lr = SVC()
lr.fit(X, y)
scores['Support Vector Machines'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)

lr = KNeighborsClassifier()
lr.fit(X, y)
scores['KNN'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)

lr = GaussianNB()
lr.fit(X, y)
scores['Naive Bayes'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)

lr = Perceptron()
lr.fit(X, y)
scores['Perceptron'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)

lr = LinearSVC()
lr.fit(X, y)
scores['Linear Support Vector Machines'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)

lr = SGDClassifier()
lr.fit(X, y)
scores['Stochastic Gradient Descent'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)

lr = DecisionTreeClassifier()
lr.fit(X, y)
scores['Decision Tree'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)

lr = RandomForestClassifier()
lr.fit(X, y)
scores['Random Forest'] = round(cross_val_score(lr, X, y, cv=10).mean()*100,2)
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
c:\Users\XLL\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\svm\_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  warnings.warn(
In [ ]:
models = scores.reset_index()
models.columns = ['model', 'score']
models = models.sort_values(by='score', ascending=False)
models
Out[ ]:
model score
8 Random Forest 81.82
0 LogisticRegression 80.59
3 Naive Bayes 80.59
7 Decision Tree 79.13
6 Stochastic Gradient Descent 77.1
4 Perceptron 72.72
2 KNN 70.94
1 Support Vector Machines 70.5
5 Linear Support Vector Machines 66.77